/*==============================================================================
FILE NAME: Process_Investigations.do
INPUTS: investigations.dta
OUTPUTS: Investigations_Clean.dta, RN_Investigations.dta, RN_Investigations_air.dta
==============================================================================*/

/* Set directory if working independently through code */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 

// Prevent Stata from pausing output
set more off

// Load raw investigations data
use "$processed_data/investigations.dta", clear

// Rename key columns for clarity
rename RegulatedEntityNo RN 
rename CustomerNo CN
rename InvestigationNo IN

// Drop unnecessary columns
drop Address CN CustomerName RegulatedEntityName InvestigationStatus InvestigationStatusDate

// Rename region, city, zip, and county columns for investigations
foreach x in TCEQRegion City ZipCode County {
    rename `x' `x'_IN
}

// Convert zip code column to numeric
destring ZipCode_IN, replace

// Restrict zip codes to valid Texas range
replace ZipCode_IN = . if ZipCode_IN < 73301 
replace ZipCode_IN = . if ZipCode_IN > 88589

// Remove duplicate rows
duplicates drop

// Create investigation indicators
gen investigation = 1
gen investigation_air = 1 if Media == "AIR"
replace investigation_air = 0 if Media == "WATER" | Media == "WASTE"
gen investigation_water = 1 if Media == "WATER"
replace investigation_water = 0 if Media == "AIR" | Media == "WASTE"
gen investigation_waste = 1 if Media == "WASTE"
replace investigation_waste = 0 if Media == "AIR" | Media == "WATER"

// Extract day, month, and year from investigation start date
gen day = day(InvestigationStartDate)
gen month = month(InvestigationStartDate)
gen year = year(InvestigationStartDate)

// Save cleaned investigations data
save "$processed_data/Investigations_Clean.dta", replace

// Create RN-level investigations summary
contract RN 
drop _freq
save "$processed_data/RN_Investigations.dta", replace

// Create RN-level summary for air investigations only
use "$processed_data/Investigations_Clean.dta", clear
keep if investigation_air == 1
contract RN 
drop _freq
save "$processed_data/RN_Investigations_air.dta", replace